Content

A. Identified gene number
B. Identified genes’ biotype
C. Identified transcript number
D. Identified transcripts’ biotype
E. Correlation of different method quantification
F. GO&KEGG enrichment of method-specific genes
Biotype of Library-specific isoforms
Biotype of Library-specific genes
GO&KEGG of Library-specific genes

preparation

suppressPackageStartupMessages(library(data.table))
gtf <- rtracklayer::readGFF("/mnt/raid61/Personal_data/tangchao/Document/gencode/human/GRCh37/gencode.v30lift37.annotation.gtf")
gtf <- rtracklayer::readGFF("/mnt/raid61/Personal_data/tangchao/Document/gencode/human/GRCh37/gencode.v30lift37.annotation.gtf")
setDT(gtf)
gtf <- gtf[type %in% c("gene", "transcript"), .(seqid, type, start, end, strand, gene_id, gene_type, gene_name, transcript_id, transcript_type)]
gtf[, gene_id:=substr(gene_id, 1, 15)]
gtf[, transcript_id:=substr(transcript_id, 1, 15)]

Gene_biotype <- fread("/mnt/raid61/Personal_data/tangchao/ScientificData/data/Annotation/Gene_biotype.csv", header = F, select = 1:2)
setkey(Gene_biotype, V1)

gtf_gene <- gtf[type == "gene", ]
gtf_gene$gene_biotype <- Gene_biotype[gtf_gene$gene_type, V2]
gtf_gene <- unique(gtf_gene[, .(gene_id, gene_biotype)])
setkey(gtf_gene, gene_id)

gtf_tx <- gtf[type == "transcript", ]
gtf_tx$transcript_biotype <- Gene_biotype[gtf_tx$transcript_type, V2]
gtf_tx <- unique(gtf_tx[, .(transcript_id, transcript_biotype)])
setkey(gtf_tx, transcript_id)
# col.p <- lattice::trellis.par.get("superpose.symbol")$col[1] # colour of PolyA RNA
# col.t <- lattice::trellis.par.get("superpose.symbol")$col[2] # colour of Total RNA
col.p <- "#00AFBB"
col.t <- "#E7B800"

SampInfo_PolyA <- fread("/mnt/raid61/Personal_data/tangchao/ScientificData/data/SampleInfo/PolyA_RNA_sampleInfo.txt")
SampInfo_Total <- fread("/mnt/raid61/Personal_data/tangchao/ScientificData/data/SampleInfo/Total_RNA_sampleInfo.txt")

A. Identified gene number

PolyA

setwd("/mnt/raid62/Chen_Cell_2016/RSEM/EGAD00001002671")
files_PolyA <- paste(SampInfo_PolyA$ID , ".genes.results", sep = "")
stopifnot(all(file.exists(files_PolyA)))
Gene_PolyA <- lapply(files_PolyA, fread, select = c(1, 5, 6))

filtering

Gene_PolyA <- lapply(Gene_PolyA, function(x) {x[expected_count >= 6 & TPM >= 0.1, ]})
Gene_PolyA <- lapply(Gene_PolyA, function(x) {x[, gene_id:=substr(gene_id, 1, 15)]})
PolyA_gene <- unique(substr(do.call(rbind, Gene_PolyA)[[1]], 1, 15))

Total

setwd("/mnt/raid62/Chen_Cell_2016/RSEM/EGAD00001002671")
files_Total <- paste(SampInfo_Total$ID , ".genes.results", sep = "")
stopifnot(all(file.exists(files_Total)))
Gene_Total <- lapply(files_Total, fread, select = c(1, 5, 6))

filtering

Gene_Total <- lapply(Gene_Total, function(x) {x[expected_count >= 6 & TPM >= 0.1, ]})
Gene_Total <- lapply(Gene_Total, function(x) {x[, gene_id:=substr(gene_id, 1, 15)]})
Total_gene <- unique(substr(do.call(rbind, Gene_Total)[[1]], 1, 15))
Mat <- rbind(data.frame(Gene = mapply(nrow, Gene_PolyA), Library = "PolyA"), data.frame(Gene = mapply(nrow, Gene_Total), Library = "Total"))

library(ggpubr)
my_comparisons = list(c("PolyA", "Total"))
ggviolin(Mat, x = "Library", y = "Gene", fill = "Library",
         palette = c(col.p, col.t),
         add = "boxplot", 
         add.params = list(fill = "white"))+
  stat_compare_means(comparisons = my_comparisons, label = "p.signif", method = "t.test")+ # Add significance levels
  stat_compare_means(method = "t.test", label.x.npc = "center")+
  guides(fill = FALSE)+
  theme(axis.title = element_text(size = 16))+
  labs(y = "No. genes")

ggviolin(Mat, x = "Library", y = "Gene", fill = "Library",
         palette = c(col.p, col.t),
         add = "boxplot", 
         add.params = list(fill = "white"))+
  stat_compare_means(label = "p.format", comparisons = my_comparisons, method = "t.test")+ # Add significance levelsaes(label = paste0("p = ", ..p.format..))
  guides(fill = FALSE)+
  theme(axis.title = element_text(size = 16))+
  labs(y = "No. genes")

input = list(PolyA = PolyA_gene, Total = Total_gene)

lab1 <- paste(sum(!input[[1]] %in% input[[2]]), "\n(", round(mean(!input[[1]] %in% input[[2]]) * 100, 2), "%)", sep = "")
lab2 <- length(intersect(input[[1]], input[[2]]))
lab3 <- paste(sum(!input[[2]] %in% input[[1]]), "\n(", round(mean(!input[[2]] %in% input[[1]]) * 100, 2), "%)", sep = "")

library(ggplot2)
library(ggforce)
library(cowplot)
ggplot() + geom_circle(aes(x0 = c(-1, 1), 
                           y0 = c(0, 0), 
                           r = c(2, 2), 
                           color = c(col.p, col.t), 
                           fill = c(col.p, col.t)), 
                       lwd = 1.5, 
                       alpha = .1)+
  guides(color = F, fill = F, alpha = F)+
  scale_fill_manual(values = c(col.p, col.t)) +
  scale_color_manual(values = c(col.p, col.t)) +
  theme_nothing() + 
  ggplot2::annotate("text", x = c(-2, 0, 2), y = 0, label = c(lab1, lab2, lab3), size = 4) + 
  ggplot2::annotate("text", x = c(-1, 1), y = 2.2, label = names(input), size = 6)

B. Identified genes’ biotype

Total_gene_biotype <- mapply(function(x) table(gtf_gene[x[[1]],]$gene_biotype), Gene_Total)
PolyA_gene_biotype <- mapply(function(x) table(gtf_gene[x[[1]],]$gene_biotype), Gene_PolyA)

Total_gene_biotype <- data.frame(reshape2::melt(Total_gene_biotype))
PolyA_gene_biotype <- data.frame(reshape2::melt(PolyA_gene_biotype))
Total_gene_biotype$Library <- "Total"
PolyA_gene_biotype$Library <- "PolyA"
gene_biotype <- rbind(PolyA_gene_biotype, Total_gene_biotype)
data_summary <- function(data, varname, groupnames){
  require(plyr)
  summary_func <- function(x, col){
    c(mean = mean(x[[col]], na.rm=TRUE),
      sd = sd(x[[col]], na.rm=TRUE))
  }
  data_sum<-ddply(data, groupnames, .fun=summary_func, varname)
  data_sum <- rename(data_sum, c("mean" = varname))
  return(data_sum)
}

df2 <- data_summary(gene_biotype, varname="value", 
                    groupnames=c("Library", "Var1"))
## Loading required package: plyr
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:ggpubr':
## 
##     mutate
ggplot(df2, aes(x=Var1, y=value, fill=Library)) + 
  geom_bar(stat="identity", color="black", 
           position=position_dodge()) +
  geom_errorbar(aes(ymin=value-sd, ymax=value+sd), width=.2,
                position=position_dodge(.9)) +
  labs(x = "Biotype", y = "No. gene")+
  theme_classic() +
  scale_fill_manual(values=c(col.p, col.t))+
  guides(fill = FALSE)+
  theme(axis.title = element_text(size = 16), 
        axis.text = element_text(size = 12), 
        axis.text.x = element_text(angle = 22, hjust = 1)) -> p

tmp <- with(gene_biotype, by(gene_biotype, Var1, function(x) t.test(value ~ Library, data = x)))
(lab <- signif(sapply(tmp, function(x) x$p.value), 2))
##      Antisense         lncRNA          ncRNA Protein coding     Pseudogene 
##        8.1e-27        5.8e-06        3.4e-29        1.6e-12        5.1e-01
y0 = as.vector(by(df2$value + df2$sd, factor(df2$Var1), max)) + 
  max(as.vector(by(df2$value + df2$sd, factor(df2$Var1), max))) * .03


p + ggplot2::annotate("text", x = 1:5,
                    y = y0, 
                    label = lab)

plevel <- function(x) {
  if(x < 1e-04) {
    "****"
  } else {
    if(x < 0.001) {
      "***"
    } else {
      if(x < 0.01) {
        "**"
      } else {
        if(x < 0.05) {
          "*"
        } else {
          "ns"
        }
      }
    }
  }
}
sapply(lab, plevel)
##      Antisense         lncRNA          ncRNA Protein coding     Pseudogene 
##         "****"         "****"         "****"         "****"           "ns"
p + ggplot2::annotate("text", x = 1:5,
                      y = y0, 
                      label = sapply(lab, plevel))

Percentage

Total_gene_biotype <- mapply(function(x) table(gtf_gene[x[[1]],]$gene_biotype), Gene_Total)
PolyA_gene_biotype <- mapply(function(x) table(gtf_gene[x[[1]],]$gene_biotype), Gene_PolyA)

Total_gene_biotype <- apply(Total_gene_biotype, 2, function(x) x/sum(x)*100)
PolyA_gene_biotype <- apply(PolyA_gene_biotype, 2, function(x) x/sum(x)*100)

Total_gene_biotype <- data.frame(reshape2::melt(Total_gene_biotype))
PolyA_gene_biotype <- data.frame(reshape2::melt(PolyA_gene_biotype))
Total_gene_biotype$Library <- "Total"
PolyA_gene_biotype$Library <- "PolyA"
gene_biotype <- rbind(PolyA_gene_biotype, Total_gene_biotype)

df2 <- data_summary(gene_biotype, varname="value", 
                    groupnames=c("Library", "Var1"))
ggplot(df2, aes(x=Var1, y=value, fill=Library)) + 
  geom_bar(stat="identity", color="black", 
           position=position_dodge()) +
  geom_errorbar(aes(ymin=value-sd, ymax=value+sd), width=.2,
                position=position_dodge(.9)) +
  labs(x = "Biotype", y = "Percentage of gene")+
  theme_classic() +
  scale_fill_manual(values=c(col.p, col.t))+
  guides(fill = FALSE)+
  theme(axis.title = element_text(size = 16), 
        axis.text = element_text(size = 12), 
        axis.text.x = element_text(angle = 22, hjust = 1)) -> p

tmp <- with(gene_biotype, by(gene_biotype, Var1, function(x) t.test(value ~ Library, data = x)))
(lab <- signif(sapply(tmp, function(x) x$p.value), 2))
##      Antisense         lncRNA          ncRNA Protein coding     Pseudogene 
##        6.6e-41        2.1e-15        8.2e-43        4.1e-01        6.1e-04
y0 = as.vector(by(df2$value + df2$sd, factor(df2$Var1), max)) + 
  max(as.vector(by(df2$value + df2$sd, factor(df2$Var1), max))) * .03

p + ggplot2::annotate("text", x = 1:5,
                    y = y0, 
                    label = lab)

sapply(lab, plevel)
##      Antisense         lncRNA          ncRNA Protein coding     Pseudogene 
##         "****"         "****"         "****"           "ns"          "***"
p + ggplot2::annotate("text", x = 1:5,
                      y = y0, 
                      label = sapply(lab, plevel))

C. Identified transcript number

PolyA

setwd("/mnt/raid62/Chen_Cell_2016/RSEM/EGAD00001002671")
files_PolyA <- paste(SampInfo_PolyA$ID , ".isoforms.results", sep = "")
stopifnot(all(file.exists(files_PolyA)))
Gene_PolyA <- lapply(files_PolyA, fread, select = c(1, 5, 6))

filtering

Gene_PolyA <- lapply(Gene_PolyA, function(x) {x[expected_count >= 6 & TPM >= 0.1, ]})
Gene_PolyA <- lapply(Gene_PolyA, function(x) {x[, transcript_id:=substr(transcript_id, 1, 15)]})
PolyA_gene <- unique(substr(do.call(rbind, Gene_PolyA)[[1]], 1, 15))

Total

setwd("/mnt/raid62/Chen_Cell_2016/RSEM/EGAD00001002671")
files_Total <- paste(SampInfo_Total$ID , ".isoforms.results", sep = "")
stopifnot(all(file.exists(files_Total)))
Gene_Total <- lapply(files_Total, fread, select = c(1, 5, 6))

filtering

Gene_Total <- lapply(Gene_Total, function(x) {x[expected_count >= 6 & TPM >= 0.1, ]})
Gene_Total <- lapply(Gene_Total, function(x) {x[, transcript_id:=substr(transcript_id, 1, 15)]})
Total_gene <- unique(substr(do.call(rbind, Gene_Total)[[1]], 1, 15))
Mat <- rbind(data.frame(Gene = mapply(nrow, Gene_PolyA), Library = "PolyA"), data.frame(Gene = mapply(nrow, Gene_Total), Library = "Total"))

library(ggpubr)
my_comparisons = list(c("PolyA", "Total"))
ggviolin(Mat, x = "Library", y = "Gene", fill = "Library",
         palette = c(col.p, col.t),
         add = "boxplot", 
         add.params = list(fill = "white"))+
  stat_compare_means(comparisons = my_comparisons, label = "p.signif", method = "t.test")+ # Add significance levels
  stat_compare_means(method = "t.test", label.x.npc = "center")+
  guides(fill = FALSE)+
  theme(axis.title = element_text(size = 16))+
  labs(y = "No. isoform")

ggviolin(Mat, x = "Library", y = "Gene", fill = "Library",
         palette = c(col.p, col.t),
         add = "boxplot", 
         add.params = list(fill = "white"))+
  stat_compare_means(label = "p.format", comparisons = my_comparisons, method = "t.test")+ # Add significance levelsaes(label = paste0("p = ", ..p.format..))
  guides(fill = FALSE)+
  theme(axis.title = element_text(size = 16))+
  labs(y = "No. isoform")

input = list(PolyA = PolyA_gene, Total = Total_gene)

lab1 <- paste(sum(!input[[1]] %in% input[[2]]), "\n(", round(mean(!input[[1]] %in% input[[2]]) * 100, 2), "%)", sep = "")
lab2 <- length(intersect(input[[1]], input[[2]]))
lab3 <- paste(sum(!input[[2]] %in% input[[1]]), "\n(", round(mean(!input[[2]] %in% input[[1]]) * 100, 2), "%)", sep = "")

library(ggplot2)
library(ggforce)
library(cowplot)
ggplot() + geom_circle(aes(x0 = c(-1, 1), 
                           y0 = c(0, 0), 
                           r = c(2, 2), 
                           color = c(col.p, col.t), 
                           fill = c(col.p, col.t)), 
                       lwd = 1.5, 
                       alpha = .1)+
  guides(color = F, fill = F, alpha = F)+
  scale_fill_manual(values = c(col.p, col.t)) +
  scale_color_manual(values = c(col.p, col.t)) +
  theme_nothing() + 
  ggplot2::annotate("text", x = c(-2, 0, 2), y = 0, label = c(lab1, lab2, lab3), size = 4) + 
  ggplot2::annotate("text", x = c(-1, 1), y = 2.2, label = names(input), size = 6)

D. Identified transcripts’ biotype

Total_gene_biotype <- mapply(function(x) table(gtf_tx[x[[1]],]$transcript_biotype), Gene_Total)
PolyA_gene_biotype <- mapply(function(x) table(gtf_tx[x[[1]],]$transcript_biotype), Gene_PolyA)

Total_gene_biotype <- data.frame(reshape2::melt(Total_gene_biotype))
PolyA_gene_biotype <- data.frame(reshape2::melt(PolyA_gene_biotype))
Total_gene_biotype$Library <- "Total"
PolyA_gene_biotype$Library <- "PolyA"
gene_biotype <- rbind(PolyA_gene_biotype, Total_gene_biotype)

df2 <- data_summary(gene_biotype, varname="value", 
                    groupnames=c("Library", "Var1"))
ggplot(df2, aes(x=Var1, y=value, fill=Library)) + 
  geom_bar(stat="identity", color="black", 
           position=position_dodge()) +
  geom_errorbar(aes(ymin=value-sd, ymax=value+sd), width=.2,
                position=position_dodge(.9)) +
  labs(x = "Biotype", y = "No. isoform")+
  theme_classic() +
  scale_fill_manual(values=c(col.p, col.t))+
  guides(fill = FALSE)+
  theme(axis.title = element_text(size = 16), 
        axis.text = element_text(size = 12), 
        axis.text.x = element_text(angle = 22, hjust = 1)) -> p

tmp <- with(gene_biotype, by(gene_biotype, Var1, function(x) t.test(value ~ Library, data = x)))
(lab <- signif(sapply(tmp, function(x) x$p.value), 2))
##      Antisense         lncRNA          ncRNA          Other Protein coding 
##        6.6e-17        2.3e-04        5.9e-08        4.8e-04        7.7e-01 
##     Pseudogene 
##        1.9e-01
y0 = as.vector(by(df2$value + df2$sd, factor(df2$Var1), max)) + 
  max(as.vector(by(df2$value + df2$sd, factor(df2$Var1), max))) * .03


p + ggplot2::annotate("text", x = 1:6,
                    y = y0, 
                    label = lab)

sapply(lab, plevel)
##      Antisense         lncRNA          ncRNA          Other Protein coding 
##         "****"          "***"         "****"          "***"           "ns" 
##     Pseudogene 
##           "ns"
p + ggplot2::annotate("text", x = 1:6,
                      y = y0, 
                      label = sapply(lab, plevel))

Percentage

Total_gene_biotype <- mapply(function(x) table(gtf_tx[x[[1]],]$transcript_biotype), Gene_Total)
PolyA_gene_biotype <- mapply(function(x) table(gtf_tx[x[[1]],]$transcript_biotype), Gene_PolyA)

Total_gene_biotype <- apply(Total_gene_biotype, 2, function(x) x/sum(x)*100)
PolyA_gene_biotype <- apply(PolyA_gene_biotype, 2, function(x) x/sum(x)*100)

Total_gene_biotype <- data.frame(reshape2::melt(Total_gene_biotype))
PolyA_gene_biotype <- data.frame(reshape2::melt(PolyA_gene_biotype))
Total_gene_biotype$Library <- "Total"
PolyA_gene_biotype$Library <- "PolyA"
gene_biotype <- rbind(PolyA_gene_biotype, Total_gene_biotype)

df2 <- data_summary(gene_biotype, varname="value", 
                    groupnames=c("Library", "Var1"))
ggplot(df2, aes(x=Var1, y=value, fill=Library)) + 
  geom_bar(stat="identity", color="black", 
           position=position_dodge()) +
  geom_errorbar(aes(ymin=value-sd, ymax=value+sd), width=.2,
                position=position_dodge(.9)) +
  labs(x = "Biotype", y = "Percentage of isoform")+
  theme_classic() +
  scale_fill_manual(values=c(col.p, col.t))+
  guides(fill = FALSE)+
  theme(axis.title = element_text(size = 16), 
        axis.text = element_text(size = 12), 
        axis.text.x = element_text(angle = 22, hjust = 1)) -> p

tmp <- with(gene_biotype, by(gene_biotype, Var1, function(x) t.test(value ~ Library, data = x)))
(lab <- signif(sapply(tmp, function(x) x$p.value), 2))
##      Antisense         lncRNA          ncRNA          Other Protein coding 
##        1.1e-52        7.8e-11        2.9e-21        5.1e-22        1.9e-10 
##     Pseudogene 
##        4.4e-01
y0 = as.vector(by(df2$value + df2$sd, factor(df2$Var1), max)) + 
  max(as.vector(by(df2$value + df2$sd, factor(df2$Var1), max))) * .03

p + ggplot2::annotate("text", x = 1:6,
                    y = y0, 
                    label = lab)

sapply(lab, plevel)
##      Antisense         lncRNA          ncRNA          Other Protein coding 
##         "****"         "****"         "****"         "****"         "****" 
##     Pseudogene 
##           "ns"
p + ggplot2::annotate("text", x = 1:6,
                      y = y0, 
                      label = sapply(lab, plevel))

E. Correlation of different method quantification

PolyA

setwd("/mnt/raid62/Chen_Cell_2016/RSEM/EGAD00001002671")
files_PolyA <- paste(SampInfo_PolyA$ID , ".genes.results", sep = "")
stopifnot(all(file.exists(files_PolyA)))
Gene_PolyA <- lapply(files_PolyA, fread, select = c(1, 5, 6))
Gene_PolyA <- lapply(Gene_PolyA, function(x) {x[expected_count >= 6 & TPM >= 0.1, ]})
Gene_PolyA <- lapply(Gene_PolyA, function(x) {x[, gene_id:=substr(gene_id, 1, 15)]})

Total

setwd("/mnt/raid62/Chen_Cell_2016/RSEM/EGAD00001002671")
files_Total <- paste(SampInfo_Total$ID , ".genes.results", sep = "")
stopifnot(all(file.exists(files_Total)))
Gene_Total <- lapply(files_Total, fread, select = c(1, 5, 6))
Gene_Total <- lapply(Gene_Total, function(x) {x[expected_count >= 6 & TPM >= 0.1, ]})
Gene_Total <- lapply(Gene_Total, function(x) {x[, gene_id:=substr(gene_id, 1, 15)]})
stopifnot(identical(SampInfo_PolyA$donor_id, SampInfo_Total$donor_id))
Gene_TPM <- list()
for(i in 1:40){
  tmp <- merge(Gene_PolyA[[i]][, c(1, 3)], Gene_Total[[i]][, c(1, 3)], by = "gene_id")
  colnames(tmp)[2:3] <- c("PolyA", "Total")
  Gene_TPM[[i]] <- tmp
}
gene_cor <- mapply(Gene_TPM, FUN = function(x) cor(log1p(x[[2]]), log1p(x[[3]])))

ggviolin(gene_cor,
         add = "boxplot", 
         fill = "#FC4E07", 
         alpha = .5, 
         add.params = list(fill = "white")) +
  ggbeeswarm::geom_quasirandom(size = 2, alpha = .5, col = "#FC4E07") +
  labs(y = "r")+
  theme_classic() +
  scale_fill_manual(values=c(col.p, col.t))+
  guides(fill = FALSE)+
  theme(axis.title = element_text(size = 22), 
        axis.text.y = element_text(size = 16), 
        axis.text.x = element_blank(), 
        axis.title.x = element_blank(), 
        axis.ticks.x = element_blank())

F. GO&KEGG enrichment of method-specific genes

Biotype of Library-specific isoforms

PolyA_Spe_isoform <- input$PolyA[!input$PolyA %in% input$Total]
Total_Spe_isoform <- input$Total[!input$Total %in% input$PolyA]
table(gtf_tx[PolyA_Spe_isoform,]$transcript_biotype)
## 
##      Antisense         lncRNA          ncRNA          Other Protein coding 
##           1286            487           1275           2244           5816 
##     Pseudogene 
##            632
table(gtf_tx[Total_Spe_isoform,]$transcript_biotype)
## 
##      Antisense         lncRNA          ncRNA          Other Protein coding 
##            413            646           2687           2791           3772 
##     Pseudogene 
##            545
table(gtf_tx[Total_Spe_isoform,]$transcript_biotype)/length(Total_Spe_isoform)
## 
##      Antisense         lncRNA          ncRNA          Other Protein coding 
##     0.03805049     0.05951723     0.24755850     0.25714022     0.34752165 
##     Pseudogene 
##     0.05021190
table(gtf_tx[PolyA_Spe_isoform,]$transcript_biotype)/length(PolyA_Spe_isoform)
## 
##      Antisense         lncRNA          ncRNA          Other Protein coding 
##     0.10954003     0.04148211     0.10860307     0.19114140     0.49540034 
##     Pseudogene 
##     0.05383305
df <- data.frame(
  group = names(table(gtf_tx[PolyA_Spe_isoform,]$transcript_biotype)),
  value = as.numeric(table(gtf_tx[PolyA_Spe_isoform,]$transcript_biotype))
)
df
##            group value
## 1      Antisense  1286
## 2         lncRNA   487
## 3          ncRNA  1275
## 4          Other  2244
## 5 Protein coding  5816
## 6     Pseudogene   632
library(ggplot2)
# Barplot
bp<- ggplot(df, aes(x="", y=value, fill=group))+
  geom_bar(width = 1, stat = "identity")
pie <- bp + coord_polar("y", start=0)
# use brewer color palettes
blank_theme <- theme_minimal()+
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.border = element_blank(),
    panel.grid=element_blank(),
    axis.ticks = element_blank(),
    plot.title=element_text(size=14, face="bold")
  )
# Use brewer palette
pie + 
  scale_fill_brewer(palette="Dark2") + 
  blank_theme +
  theme(axis.text.x = element_blank())+
  geom_text(data = df, aes(x = 1.3, y = c(0, cumsum(rev(value))[1:5]) + rev(value)/2, 
                           label = rev(paste0(round(value/sum(value)*100, 2), "%"))), size = 3)

library(plotly)
## 
## Attaching package: 'plotly'
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
p <- plot_ly(df, labels = ~group, values = ~value, type = 'pie') %>%
  layout(title = 'PolyA',
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
p
df <- data.frame(
  group = names(table(gtf_tx[Total_Spe_isoform,]$transcript_biotype)),
  value = as.numeric(table(gtf_tx[Total_Spe_isoform,]$transcript_biotype))
)
df
##            group value
## 1      Antisense   413
## 2         lncRNA   646
## 3          ncRNA  2687
## 4          Other  2791
## 5 Protein coding  3772
## 6     Pseudogene   545
# Barplot
bp<- ggplot(df, aes(x="", y=value, fill=group))+
  geom_bar(width = 1, stat = "identity")
pie <- bp + coord_polar("y", start=0)
# use brewer color palettes
blank_theme <- theme_minimal()+
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.border = element_blank(),
    panel.grid=element_blank(),
    axis.ticks = element_blank(),
    plot.title=element_text(size=14, face="bold")
  )
# Use brewer palette
pie + 
  scale_fill_brewer(palette="Dark2") + 
  blank_theme +
  theme(axis.text.x = element_blank())+
  geom_text(data = df, aes(x = 1.3, y = c(0, cumsum(rev(value))[1:5]) + rev(value)/2, 
                           label = rev(paste0(round(value/sum(value)*100, 2), "%"))), size = 3)

library(plotly)
p <- plot_ly(df, labels = ~group, values = ~value, type = 'pie') %>%
  layout(title = 'Total',
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
p
df_PolyA <- data.frame(
  group = names(table(gtf_tx[PolyA_Spe_isoform,]$transcript_biotype)),
  value = as.numeric(table(gtf_tx[PolyA_Spe_isoform,]$transcript_biotype))/length(PolyA_Spe_isoform),
  Library = "PolyA")

df_Total <- data.frame(
  group = names(table(gtf_tx[Total_Spe_isoform,]$transcript_biotype)),
  value = as.numeric(table(gtf_tx[Total_Spe_isoform,]$transcript_biotype))/length(Total_Spe_isoform),
  Library = "Total")

df <- rbind(df_PolyA, df_Total)
ggplot(df, aes(group, value, fill = Library)) +
  geom_col(position=position_dodge())+
  # coord_flip() +
  labs(x = "Biotype", y = "Percentage of isoform", title = "Biotype of Library-specific isoforms")+
  theme_classic() +
  # scale_y_continuous(breaks = c(-0.4, -0.2, 0.0, 0.2), label = c("0.4", "0.2", "0.0", "0.2"))+
  scale_fill_manual(values=c(col.p, col.t))+
  # guides(fill = FALSE)+
  theme(axis.title = element_text(size = 16), 
        axis.text = element_text(size = 12), 
        # axis.text.x = element_text(angle = 22, hjust = 1), 
        legend.position = "top")

Biotype of Library-specific genes

setwd("/mnt/raid62/Chen_Cell_2016/RSEM/EGAD00001002671")
files_PolyA <- paste(SampInfo_PolyA$ID , ".genes.results", sep = "")
stopifnot(all(file.exists(files_PolyA)))
Gene_PolyA <- lapply(files_PolyA, fread, select = c(1, 5, 6))

# filtering
Gene_PolyA <- lapply(Gene_PolyA, function(x) {x[expected_count >= 6 & TPM >= 0.1, ]})
Gene_PolyA <- lapply(Gene_PolyA, function(x) {x[, gene_id:=substr(gene_id, 1, 15)]})
PolyA_gene <- unique(substr(do.call(rbind, Gene_PolyA)[[1]], 1, 15))
setwd("/mnt/raid62/Chen_Cell_2016/RSEM/EGAD00001002671")
files_Total <- paste(SampInfo_Total$ID , ".genes.results", sep = "")
stopifnot(all(file.exists(files_Total)))
Gene_Total <- lapply(files_Total, fread, select = c(1, 5, 6))

# filtering
Gene_Total <- lapply(Gene_Total, function(x) {x[expected_count >= 6 & TPM >= 0.1, ]})
Gene_Total <- lapply(Gene_Total, function(x) {x[, gene_id:=substr(gene_id, 1, 15)]})
Total_gene <- unique(substr(do.call(rbind, Gene_Total)[[1]], 1, 15))
input = list(PolyA = PolyA_gene, Total = Total_gene)
PolyA_Spe_gene <- input$PolyA[!input$PolyA %in% input$Total]
Total_Spe_gene <- input$Total[!input$Total %in% input$PolyA]
table(gtf_gene[PolyA_Spe_gene,]$gene_biotype)
## 
##      Antisense         lncRNA          ncRNA Protein coding     Pseudogene 
##            924            275             63           1185            644
table(gtf_gene[Total_Spe_gene,]$gene_biotype)
## 
##      Antisense         lncRNA          ncRNA Protein coding     Pseudogene 
##            150            432            243            327            535
table(gtf_gene[PolyA_Spe_gene,]$gene_biotype)/length(PolyA_Spe_gene)
## 
##      Antisense         lncRNA          ncRNA Protein coding     Pseudogene 
##     0.29893238     0.08896797     0.02038175     0.38337108     0.20834681
table(gtf_gene[Total_Spe_gene,]$gene_biotype)/length(Total_Spe_gene)
## 
##      Antisense         lncRNA          ncRNA Protein coding     Pseudogene 
##     0.08891523     0.25607587     0.14404268     0.19383521     0.31713100
df <- data.frame(
  group = names(table(gtf_gene[PolyA_Spe_gene,]$gene_biotype)),
  value = as.numeric(table(gtf_gene[PolyA_Spe_gene,]$gene_biotype))
)
df
##            group value
## 1      Antisense   924
## 2         lncRNA   275
## 3          ncRNA    63
## 4 Protein coding  1185
## 5     Pseudogene   644
library(ggplot2)
# Barplot
bp<- ggplot(df, aes(x="", y=value, fill=group))+
  geom_bar(width = 1, stat = "identity")
pie <- bp + coord_polar("y", start=0)
# use brewer color palettes
blank_theme <- theme_minimal()+
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.border = element_blank(),
    panel.grid=element_blank(),
    axis.ticks = element_blank(),
    plot.title=element_text(size=14, face="bold")
  )
# Use brewer palette
pie + 
  scale_fill_brewer(palette="Dark2") + 
  blank_theme +
  theme(axis.text.x = element_blank())+
  geom_text(data = df, aes(x = 1.3, y = c(0, cumsum(rev(value))[1:4]) + rev(value)/2, 
                           label = rev(paste0(round(value/sum(value)*100, 2), "%"))), size = 3)

library(plotly)
p <- plot_ly(df, labels = ~group, values = ~value, type = 'pie') %>%
  layout(title = 'PolyA',
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
p
df <- data.frame(
  group = names(table(gtf_gene[Total_Spe_gene,]$gene_biotype)),
  value = as.numeric(table(gtf_gene[Total_Spe_gene,]$gene_biotype))
)
df
##            group value
## 1      Antisense   150
## 2         lncRNA   432
## 3          ncRNA   243
## 4 Protein coding   327
## 5     Pseudogene   535
# Barplot
bp<- ggplot(df, aes(x="", y=value, fill=group))+
  geom_bar(width = 1, stat = "identity")
pie <- bp + coord_polar("y", start=0)
# use brewer color palettes
blank_theme <- theme_minimal()+
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.border = element_blank(),
    panel.grid=element_blank(),
    axis.ticks = element_blank(),
    plot.title=element_text(size=14, face="bold")
  )
# Use brewer palette
pie + 
  scale_fill_brewer(palette="Dark2") + 
  blank_theme +
  theme(axis.text.x = element_blank())+
  geom_text(data = df, aes(x = 1.3, y = c(0, cumsum(rev(value))[1:4]) + rev(value)/2, 
                           label = rev(paste0(round(value/sum(value)*100, 2), "%"))), size = 3)

library(plotly)
p <- plot_ly(df, labels = ~group, values = ~value, type = 'pie') %>%
  layout(title = 'Total',
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
p
df_PolyA <- data.frame(
  group = names(table(gtf_gene[PolyA_Spe_gene,]$gene_biotype)),
  value = -as.numeric(table(gtf_gene[PolyA_Spe_gene,]$gene_biotype))/length(PolyA_Spe_gene),
  Library = "PolyA")

df_Total <- data.frame(
  group = names(table(gtf_gene[Total_Spe_gene,]$gene_biotype)),
  value = as.numeric(table(gtf_gene[Total_Spe_gene,]$gene_biotype))/length(Total_Spe_gene),
  Library = "Total")

df <- rbind(df_PolyA, df_Total)
ggplot(df, aes(group, value, fill = Library)) +
  geom_col()+
  coord_flip() +
  labs(x = "Biotype", y = "Percentage of gene", title = "Biotype of Library-specific genes")+
  theme_classic() +
  scale_y_continuous(breaks = c(-0.4, -0.2, 0.0, 0.2), label = c("0.4", "0.2", "0.0", "0.2"))+
  scale_fill_manual(values=c(col.p, col.t))+
  # guides(fill = FALSE)+
  theme(axis.title = element_text(size = 16), 
        axis.text = element_text(size = 12), 
        # axis.text.x = element_text(angle = 22, hjust = 1), 
        legend.position = "top")

GO of Library-specific genes

suppressPackageStartupMessages(library(clusterProfiler))
library(org.Hs.eg.db)
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter,
##     Find, get, grep, grepl, intersect, is.unsorted, lapply, Map,
##     mapply, match, mget, order, paste, pmax, pmax.int, pmin,
##     pmin.int, Position, rank, rbind, Reduce, rownames, sapply,
##     setdiff, sort, table, tapply, union, unique, unsplit, which,
##     which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:plotly':
## 
##     rename
## The following object is masked from 'package:plyr':
## 
##     rename
## The following objects are masked from 'package:data.table':
## 
##     first, second
## The following object is masked from 'package:base':
## 
##     expand.grid
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:plotly':
## 
##     slice
## The following object is masked from 'package:plyr':
## 
##     desc
## The following object is masked from 'package:data.table':
## 
##     shift
## 
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:plotly':
## 
##     select
## 
PolyA_Spe_gene_ego <- enrichGO(gene          = PolyA_Spe_gene,
                             # universe      = names(geneList),
                               OrgDb         = org.Hs.eg.db,
                               keyType       = "ENSEMBL",
                               ont           = "ALL",
                               pAdjustMethod = "BH",
                               pvalueCutoff  = 0.01,
                               qvalueCutoff  = 0.05,
                               readable      = TRUE)

Total_Spe_gene_ego <- enrichGO(gene          = Total_Spe_gene,
                             # universe      = names(geneList),
                               OrgDb         = org.Hs.eg.db,
                               keyType       = "ENSEMBL",
                               ont           = "ALL",
                               pAdjustMethod = "BH",
                               pvalueCutoff  = 0.01,
                               qvalueCutoff  = 0.05,
                               readable      = TRUE)
dotplot(PolyA_Spe_gene_ego)
## wrong orderBy parameter; set to default `orderBy = "x"`

dotplot(Total_Spe_gene_ego)
## wrong orderBy parameter; set to default `orderBy = "x"`

gene_list = list(PolyA = as.character(bitr(PolyA_Spe_gene, fromType = "ENSEMBL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)$ENTREZID), 
                 Total = as.character(bitr(Total_Spe_gene, fromType = "ENSEMBL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)$ENTREZID))
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
ck <- compareCluster(gene_list, fun = "enrichKEGG", organism="hsa")
dotplot(ck)

gene <- as.character(bitr(PolyA_Spe_gene, fromType = "ENSEMBL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)$ENTREZID)
## 'select()' returned 1:many mapping between keys and columns
PolyA_Spe_gene_kk <- enrichKEGG(gene         = gene,
                                organism     = 'hsa',
                                pvalueCutoff = 0.05)
gene <- as.character(bitr(Total_Spe_gene, fromType = "ENSEMBL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)$ENTREZID)
## 'select()' returned 1:many mapping between keys and columns
Total_Spe_gene_kk <- enrichKEGG(gene         = gene,
                                organism     = 'hsa',
                                pvalueCutoff = 0.05)
if(nrow(PolyA_Spe_gene_kk) > 0) dotplot(PolyA_Spe_gene_kk)
## wrong orderBy parameter; set to default `orderBy = "x"`

if(nrow(Total_Spe_gene_kk) > 0) dotplot(Total_Spe_gene_kk)
## wrong orderBy parameter; set to default `orderBy = "x"`

xx2 <- compareCluster(geneClusters = list(PolyA = PolyA_Spe_gene, Total = Total_Spe_gene), 
                      fun="enrichGO", 
                      OrgDb  = org.Hs.eg.db, 
                      keyType= 'ENSEMBL', 
                      ont  = "ALL", 
                      pAdjustMethod = "BH")
dotplot(xx2)